Code
# _common.qmd
# Shared setup file for Quarto project
# Handles imports, dataset loading, and global variables.
import os
import pandas as pd
import numpy as np
import plotly.express as px

# Locate the dataset: prefer the data/ subfolder, fall back to project root.
CSV_PATHS = ["data/lightcast_job_postings.csv", "lightcast_job_postings.csv"]
csv_path = next((p for p in CSV_PATHS if os.path.exists(p)), None)

if not csv_path:
    # Keep the site rendering even when the data file is absent.
    print("⚠️ lightcast_job_postings.csv not found — continuing with empty DataFrame.")
    df = pd.DataFrame()
else:
    df = pd.read_csv(csv_path, low_memory=False)
    print(f"✅ Dataset loaded: {csv_path} → {df.shape[0]} rows, {df.shape[1]} columns")

# Optional cleanup / derived columns (skipped entirely on an empty frame).
if not df.empty:
    # Salary: coerce to numeric, then median-impute a display column so
    # downstream plots never break on NaN salaries.
    if "SALARY" in df.columns:
        df["SALARY"] = pd.to_numeric(df["SALARY"], errors="coerce")
        df["SALARY_DISPLAY"] = df["SALARY"].fillna(df["SALARY"].median())
    elif "SALARY_DISPLAY" not in df.columns:
        df["SALARY_DISPLAY"] = np.nan

    # Industry: prefer the most specific NAICS 2022 name available.
    if "NAICS_2022_6_NAME" in df.columns:
        df["INDUSTRY_DISPLAY"] = df["NAICS_2022_6_NAME"]
    elif "INDUSTRY" in df.columns:
        df["INDUSTRY_DISPLAY"] = df["INDUSTRY"]
    else:
        df["INDUSTRY_DISPLAY"] = "Unknown"
✅ Dataset loaded: data/lightcast_job_postings.csv → 72498 rows, 131 columns
---
title: "Introduction: Group 5"
subtitle: "About The Team"
---
Team: Connor Coulter, Wei Wang, Balqis Bevi Abdul Hannan Kanaga
Topic: AI vs. Non-AI Job Growth — Is AI taking over or creating more jobs?
Course: AD 688
This site hosts our research rationale, intro, and literature review for Project Selection I, II and III.
Import Data
Code
import pandas as pd, numpy as np, os

# --- Load dataset safely ---
# Prefer data/ subfolder, fall back to project root; fail fast if absent.
CSV_PATHS = ["data/lightcast_job_postings.csv", "lightcast_job_postings.csv"]
csv_path = next((p for p in CSV_PATHS if os.path.exists(p)), None)
if not csv_path:
    raise FileNotFoundError("⚠️ lightcast_job_postings.csv not found")

df = pd.read_csv(csv_path, low_memory=False)
print("✅ Loaded dataset:", df.shape)

# --- Ensure key derived columns exist ---
# Industry display: prefer the most specific NAICS 2022 name available.
if "NAICS_2022_6_NAME" in df.columns:
    df["INDUSTRY_DISPLAY"] = df["NAICS_2022_6_NAME"]
elif "INDUSTRY" in df.columns:
    df["INDUSTRY_DISPLAY"] = df["INDUSTRY"]
else:
    df["INDUSTRY_DISPLAY"] = "Unknown"

# Salary display: take the first salary-like column present in this extract.
salary_candidates = ["SALARY", "SALARY_MEDIAN", "SALARY_MID", "SALARY_ANNUAL", "PAY_RATE"]
found_salary = next((c for c in salary_candidates if c in df.columns), None)
if found_salary:
    df["SALARY_DISPLAY"] = pd.to_numeric(df[found_salary], errors="coerce")
    # FIX: fillna(..., inplace=True) on a column selection is chained
    # assignment — deprecated in pandas 2.x and a silent no-op under
    # copy-on-write. Assign the result back instead.
    df["SALARY_DISPLAY"] = df["SALARY_DISPLAY"].fillna(df["SALARY_DISPLAY"].median())
else:
    df["SALARY_DISPLAY"] = np.nan

print("Derived columns:", {
    "INDUSTRY_DISPLAY": df["INDUSTRY_DISPLAY"].notna().sum(),
    "SALARY_DISPLAY": df["SALARY_DISPLAY"].notna().sum(),
})
✅ Loaded dataset: (72498, 131)
Derived columns: {'INDUSTRY_DISPLAY': np.int64(72454), 'SALARY_DISPLAY': np.int64(72498)}
print("Derived non-null:", { "INDUSTRY_DISPLAY": df["INDUSTRY_DISPLAY"].notna().sum(), "SALARY_DISPLAY": df["SALARY_DISPLAY"].notna().sum() })
::: {#013b4846 .cell execution_count=3}
``` {.python .cell-code}
# Data Cleaning & Preprocessing
# Drop Unnecessary Columns
# Data cleaning & preprocessing — drop identifiers, raw text, and the
# redundant NAICS/SOC/CIP/LOT/ONET taxonomy hierarchies not used in the
# analysis (only the NAICS 2022 level-6 names are kept downstream).
columns_to_drop = [
    "ID","LAST_UPDATED_TIMESTAMP","DUPLICATES","ACTIVE_URLS","ACTIVE_SOURCES_INFO",
    "TITLE_RAW","BODY","COMPANY_RAW",
    "NAICS2","NAICS2_NAME","NAICS3","NAICS3_NAME","NAICS4","NAICS4_NAME",
    "NAICS5","NAICS5_NAME","NAICS6","NAICS6_NAME",
    "NAICS_2022_2","NAICS_2022_2_NAME","NAICS_2022_3","NAICS_2022_3_NAME",
    "NAICS_2022_4","NAICS_2022_4_NAME","NAICS_2022_5","NAICS_2022_5_NAME",
    "SOC_2","SOC_2_NAME","SOC_3","SOC_3_NAME","SOC_5","SOC_5_NAME",
    "CIP2","CIP2_NAME","CIP4","CIP4_NAME","CIP6","CIP6_NAME",
    "LOT_CAREER_AREA","LOT_CAREER_AREA_NAME","LOT_OCCUPATION","LOT_OCCUPATION_NAME",
    "LOT_SPECIALIZED_OCCUPATION","LOT_SPECIALIZED_OCCUPATION_NAME",
    "LOT_OCCUPATION_GROUP","LOT_OCCUPATION_GROUP_NAME",
    "LOT_V6_SPECIALIZED_OCCUPATION","LOT_V6_SPECIALIZED_OCCUPATION_NAME",
    "LOT_V6_OCCUPATION","LOT_V6_OCCUPATION_NAME","LOT_V6_OCCUPATION_GROUP",
    "LOT_V6_OCCUPATION_GROUP_NAME","LOT_V6_CAREER_AREA","LOT_V6_CAREER_AREA_NAME",
    "ONET","ONET_NAME","ONET_2019","ONET_2019_NAME",
]

# Only drop what actually exists in this extract — avoids KeyError if the
# vendor schema changes between pulls.
df.drop(columns=[c for c in columns_to_drop if c in df.columns], inplace=True)
print("Remaining columns (first 30):", list(df.columns)[:30])
Remaining columns (first 30): ['LAST_UPDATED_DATE', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION']
:::
Code
# Handle missing values
import missingno as msno, matplotlib.pyplot as plt

# Visualize the missingness correlation structure before dropping anything.
msno.heatmap(df)
plt.title("Missing Values Heatmap")
plt.show()

# Drop columns that are more than 50% missing.
# FIX: dropna's `thresh` must be an integer (a float raises in recent
# pandas); cast the 50% cutoff explicitly.
df.dropna(thresh=int(len(df) * 0.5), axis=1, inplace=True)

# FIX: fillna(..., inplace=True) on a column selection is chained
# assignment — deprecated in pandas 2.x and a silent no-op under
# copy-on-write. Assign the result back instead.
if "SALARY_DISPLAY" in df.columns:
    df["SALARY_DISPLAY"] = df["SALARY_DISPLAY"].fillna(df["SALARY_DISPLAY"].median())
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].fillna("Unknown")

# Remove duplicates on the identifying fields present in this extract.
subset_cols = [c for c in ["TITLE", "COMPANY_NAME", "LOCATION", "POSTED"] if c in df.columns]
if subset_cols:
    before = len(df)
    df.drop_duplicates(subset=subset_cols, keep="first", inplace=True)
    print(f"Removed {before - len(df)} duplicates using {subset_cols}")

# Exploratory Data Analysis (EDA)
# Job postings by industry (top 15), sorted ascending so the horizontal bar
# chart shows the largest industry on top.
import plotly.express as px

counts = (
    df["INDUSTRY_DISPLAY"]
    .value_counts(dropna=False)
    .head(15)
    .reset_index(name="Count")
    .rename(columns={"index": "Industry"})
    .sort_values("Count")
)
fig1 = px.bar(
    counts, x="Count", y="INDUSTRY_DISPLAY", orientation="h",
    title="Top 15 Industries by Number of Job Postings",
)
fig1.show()
Removed 3300 duplicates using ['TITLE', 'COMPANY_NAME', 'LOCATION', 'POSTED']
Code
# Salary distribution by industry, restricted to the 15 highest-volume
# industries and to positive, non-missing salaries.
sdf = df[["INDUSTRY_DISPLAY", "SALARY_DISPLAY"]].copy().dropna()
sdf = sdf[sdf["SALARY_DISPLAY"] > 0]

top_industries = sdf["INDUSTRY_DISPLAY"].value_counts().head(15).index
sdf = sdf[sdf["INDUSTRY_DISPLAY"].isin(top_industries)]

fig2 = px.box(
    sdf, x="INDUSTRY_DISPLAY", y="SALARY_DISPLAY",
    title="Salary Distribution by Industry (Top 15)",
    points=False,
)
fig2.update_layout(xaxis_tickangle=-45)
fig2.show()

# Remote vs. on-site jobs — only if the column survived column pruning.
if "REMOTE_TYPE_NAME" in df.columns:
    rc = df["REMOTE_TYPE_NAME"].value_counts().reset_index()
    rc.columns = ["Remote Type", "Count"]
    fig3 = px.pie(
        rc, names="Remote Type", values="Count",
        title="Remote vs. On-Site Job Distribution",
    )
    fig3.show()
EDA: Rationale & Insights
Job Postings by Industry
Why: Highlights sectors where demand is concentrated, showing which industries are actively hiring. Key Insights: The top three industries by job postings are Temporary Help Services, Miscellaneous Ambulatory Health Care Services, and Semiconductor and Related Device Manufacturing.
Salary Distribution by Industry
Why: Shows where negotiation power exists and highlights industries paying well. Key Insights: Automotive Parts and Accessories Retailers show a wide range (negotiation potential), while Barber Shops show a narrow range (little negotiation).
Remote vs. On-Site Jobs
Why: Workplace flexibility is a major factor in today’s job market. Key Insights: Most postings (78.3%) don’t specify remote status. About 17% are remote, 3.1% hybrid, and 1.6% explicitly not remote.
Job Postings by Industry (Top 15)
Code
import os
import plotly.io as pio
import plotly.express as px

# Render to static PNG inside the site/VS Code notebook views.
pio.renderers.default = "vscode+notebook+png"

# Filter out "Unclassified Industry" before getting the top 15 so the chart
# only ranks real industries.
counts = (
    df[df["INDUSTRY_DISPLAY"] != "Unclassified Industry"]
    ["INDUSTRY_DISPLAY"]
    .value_counts()
    .head(15)
    .reset_index(name="Count")
    .rename(columns={"INDUSTRY_DISPLAY": "Industry"})
)
fig1 = px.bar(
    counts, x="Count", y="Industry", orientation="h",
    title="Top 15 Industries by Number of Job Postings (Excluding Unclassified)",
)
fig1.show()

# FIX: write_image raises FileNotFoundError when the output/ directory is
# missing (e.g. on a fresh clone); create it before exporting.
os.makedirs("output", exist_ok=True)
fig1.write_image("output/plot 1.svg", width=3500, height=600, scale=1)
Rationale
Highlights sectors where demand is concentrated, showing which industries are actively hiring.
Key Insights
Top Hiring Industries: Custom Computer Programming, Management Consulting, and Employment Agencies dominate job postings.
Skewed Distribution: The top 4 industries account for a significantly larger share of job postings than the rest.
Professional Services Focus: Many high-posting sectors are centered around tech, consulting, healthcare and education – reflecting demand for knowledge-based roles.
Salary Distribution by Industry (Top 15)
Code
# Salary-by-industry box plot, labeled for readers: positive salaries only,
# limited to the 15 industries with the most postings.
salary_df = df[["INDUSTRY_DISPLAY", "SALARY_DISPLAY"]].copy()
salary_df = salary_df.dropna()
salary_df = salary_df[salary_df["SALARY_DISPLAY"] > 0]

top15 = salary_df["INDUSTRY_DISPLAY"].value_counts().head(15).index
salary_df = salary_df[salary_df["INDUSTRY_DISPLAY"].isin(top15)]

fig2 = px.box(
    salary_df,
    x="INDUSTRY_DISPLAY",
    y="SALARY_DISPLAY",
    title="Salary Distribution by Industry (Top 15)",
    points=False,
    # Human-readable axis labels instead of raw column names.
    labels={"INDUSTRY_DISPLAY": "Industry", "SALARY_DISPLAY": "Salary"},
)
fig2.update_layout(xaxis_tickangle=-45)
fig2.show()
fig2.write_image("output/plot 2.svg", width=3500, height=600, scale=1)
Rationale
Shows where negotiation power exists and highlights industries paying well.
Key Insights
Wide Salary Ranges in Staffing & Tech Services: Industries like Temporary Help Services and Employment Placement Agencies exhibit large salary spreads with high outliers, though their median pay remains modest.
Stable Pay in Professional Sectors: Most industries maintain a consistent median salary around $100K-$150K, reflecting standardized compensation and less variation in negotiation power.
Remote vs. On-Site Jobs
Code
if "REMOTE_TYPE_NAME" in df.columns:
    # Tally remote-type labels into a two-column frame for the pie chart.
    remote_counts = df["REMOTE_TYPE_NAME"].value_counts().reset_index()
    remote_counts.columns = ["Remote Type", "Count"]

    # Updated custom color palette with cohesive blues
    color_sequence = ["#636EFA", "#4A90E2", "#00CC96", "#AB63FA", "#274BDB"]

    fig3 = px.pie(
        remote_counts,
        names="Remote Type",
        values="Count",
        title="Remote vs. On-Site Job Distribution",
        color_discrete_sequence=color_sequence,
    )
    fig3.show()
    fig3.write_image("output/plot 3.svg", width=3500, height=600, scale=1)
Rationale
Workplace flexibility is a major factor in today’s job market.
Key Insights
Limited Remote Availability: Only about 17% of job postings are labeled as Remote, with Hybrid Remote and Not Remote making up even smaller portions.
Data Gaps in Job Listings: A significant 78.3% of postings lack remote classification, indicating either incomplete employer data or inconsistent labeling, which may affect job seekers’ filtering and selection.
Group 5 skill level
Code
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

# 3.1.1 Group 5 skill level — self-assessed ratings per team member.
skills_data = {
    "Name": ["Connor", "Wei", "Balqis"],
    "Python": [2, 1, 3],
    "SQL": [2, 2, 4],
    "Machine Learning": [2, 1, 2],
    "Cloud Computing": [2, 1, 2],
}
df_skills = pd.DataFrame(skills_data).set_index("Name")

# Heatmap: one row per member, one column per skill.
plt.figure(figsize=(6, 4))
sns.heatmap(df_skills, annot=True, cmap="PuBuGn", linewidths=0.5)
plt.title("Group 5 Skill Level")
plt.show()
Compare our group’s skills against job market demand
Code
# 3.1.2 Compare our group's skills against job market demand
top_skills = ["Python", "SQL", "Machine Learning", "Cloud Computing", "Docker", "AWS"]

# FIX: removed `job_skill_counts = Counter(top_skills)` — it was never read,
# and counting a list of unique names yields all 1s anyway (dead code).

# Add any in-demand skill the team has not self-rated, scored 0 (no skill).
for skill in top_skills:
    if skill not in df_skills.columns:
        df_skills[skill] = 0

df_skills
Name
Connor
2
2
2
2
0
0
Wei
1
2
1
1
0
0
Balqis
3
4
2
2
0
0
Improvement Plan
Balqis : Her Machine Learning and Cloud Computing are at a basic level, leaving room to grow. With a career in data analysis and visualization, Machine Learning isn’t her top priority, but Cloud Computing is worth developing further. Strengthening Python would also be valuable, as it’s essential for data analysts. A good approach is to sharpen her skills through small personal projects and apply what she learns at work. If her fundamentals feel solid, she can move towards certifications.
Wei : Her Python and Machine Learning are at a basic level, so she has the option to develop them further depending on how relevant they are to her career path. Since her SQL is already stronger, focusing on Python would be the most practical next step if she chooses to continue building technical skills. A good approach is to take it gradually through small projects and applied practice, and then expand into more advanced areas only if it fits her goals.
Connor : His skills are fairly even across all areas, at a basic stage, which gives him room to build depth. Bumping Python up to a stronger level would give him the most flexibility, while also continuing to grow in Cloud Computing to keep pace with current tools and workflows. A steady way forward is to practice Python through hands-on work and then bring in cloud tools as he becomes more confident.
Introduction
This project explores AI vs Non-AI careers using the lightcast_job_postings.csv dataset.
We apply clustering, regression, and classification to evaluate trends in job markets, with a focus on salary, experience, and employability .
The goal is to help job seekers understand how AI is shaping opportunities in 2024.
Analysis
Code
# Modeling dependencies: scikit-learn for regression, classification, and
# clustering; Plotly for all figures.
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.io as pio
import plotly.figure_factory as ff
# Global Plotly defaults: white theme, static PNG rendering for the site.
pio.templates.default = "plotly_white"
pio.renderers.default = "png"
Load dataset
Code
import os

# Locate the dataset: prefer data/ subfolder, fall back to project root.
CSV_PATHS = ["data/lightcast_job_postings.csv", "lightcast_job_postings.csv"]
csv_path = next((p for p in CSV_PATHS if os.path.exists(p)), None)
if not csv_path:
    raise FileNotFoundError("⚠️ Dataset not found in data/ or project root")

df = pd.read_csv(csv_path, low_memory=False)
print(f"✅ Loaded dataset: {csv_path} → {df.shape}")

# Salary: numeric, NaN where missing or unparseable.
if "SALARY" in df.columns:
    df["salary"] = pd.to_numeric(df["SALARY"], errors="coerce")
else:
    # FIX: np.nan instead of None so the column is float, not object dtype.
    df["salary"] = np.nan

# Industry: most specific NAICS 2022 name, else the 2-digit sector name.
if "NAICS_2022_6_NAME" in df.columns:
    df["industry"] = df["NAICS_2022_6_NAME"]
else:
    df["industry"] = df.get("NAICS_2022_2_NAME", "Unknown")

# AI flag from the job title.
# FIX: the bare alternative "AI" with case=False matched "ai" inside
# ordinary words ("Maintenance", "Trainer", ...), inflating the AI count;
# \bAI\b matches "AI" only as a standalone word.
df["is_ai"] = df["TITLE_NAME"].str.contains(
    r"\bAI\b|Artificial Intelligence|Machine Learning|Deep Learning",
    case=False, na=False,
).astype(int)

# Experience: per-row midpoint of min/max years (row mean skips NaN).
df["years_experience"] = df[["MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE"]].mean(axis=1)

# Keep only rows with actual salary and years_experience data
print(f"Original dataset: {len(df)} rows")
df = df.dropna(subset=["salary", "years_experience"])
print(f"After removing missing salary and years_experience: {len(df)} rows")

# Handle missing values: drop non-derived columns that are >50% missing.
derived_cols = ["salary", "industry", "is_ai", "years_experience"]
cols_to_check = [col for col in df.columns if col not in derived_cols]
cols_to_drop = [col for col in cols_to_check if df[col].isna().sum() > len(df) * 0.5]
df.drop(columns=cols_to_drop, inplace=True)

# FIX: fillna(..., inplace=True) on a column selection is chained
# assignment (deprecated / no-op under pandas copy-on-write); assign back.
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].fillna("Unknown")

# Preview table: first 10 rows of the engineered features.
preview = df[["TITLE_NAME", "industry", "salary", "years_experience", "is_ai"]].head(10).copy()
preview["salary"] = preview["salary"].apply(lambda x: f"${x:,.0f}" if pd.notna(x) else "")
preview["years_experience"] = preview["years_experience"].round(1)
fig_preview = ff.create_table(preview, index=False)
fig_preview.update_layout(width=2500, height=500)
fig_preview.write_image("preview_table.png")
# fig_preview.show()
✅ Loaded dataset: data/lightcast_job_postings.csv → (72498, 131)
Original dataset: 72498 rows
After removing missing salary and years_experience: 23697 rows
Code
# Regression: predict salary from numeric features plus industry dummies.
df_model = df.dropna(subset=["salary"]).copy()
df_model = pd.get_dummies(df_model, columns=["industry"], drop_first=True)

X = df_model.drop(columns=["TITLE_NAME", "salary", "is_ai"])
# FIX: pandas >= 2.0 emits get_dummies columns as bool (not uint8), so the
# industry dummies were silently dropped from the feature matrix; include
# "bool" explicitly.
X = X.select_dtypes(include=["int64", "float64", "uint8", "bool"])
y = df_model["salary"]

# Median-impute remaining numeric gaps before fitting.
imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=42)
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Bar chart of results
reg_results = pd.DataFrame({"Metric": ["RMSE", "R²"], "Value": [rmse, r2]})
fig_reg = px.bar(reg_results, x="Metric", y="Value",
                 title="Regression Performance: Salary Prediction",
                 text_auto=True)
fig_reg.write_image("regression_performance.png")
# fig_reg.show()
Classification: AI vs Non-AI Jobs
Code
# Classification: logistic regression separating AI from non-AI postings.
df_clf = df.dropna(subset=["salary"]).copy()
df_clf = pd.get_dummies(df_clf, columns=["industry"], drop_first=True)

X = df_clf.drop(columns=["TITLE_NAME", "salary", "is_ai"])
# FIX: pandas >= 2.0 emits get_dummies columns as bool (not uint8); include
# "bool" so the industry dummies are kept as features.
X = X.select_dtypes(include=["int64", "float64", "uint8", "bool"])
y = df_clf["is_ai"]
X_imputed = imputer.fit_transform(X)

# Guard: a single-class target would make fit/F1 meaningless.
if len(y.unique()) > 1:
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=42)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    # Performance bar chart
    clf_results = pd.DataFrame({"Metric": ["Accuracy", "F1 Score"], "Value": [acc, f1]})
    fig_clf = px.bar(clf_results, x="Metric", y="Value",
                     title="Classification Performance: AI vs Non-AI",
                     text_auto=True)
    # fig_clf.show()

    # Confusion Matrix
    fig_cm = px.imshow(cm, text_auto=True, title="Confusion Matrix",
                       labels=dict(x="Predicted", y="Actual"))
    fig_cm.show()
else:
    print("⚠️ Only one class found — cannot run classification.")
Clustering: Job Segmentation
Code
# Clustering: segment postings by salary and experience with K-means (k=3).
features = df[["salary", "years_experience"]].dropna()
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
features["cluster"] = kmeans.fit_predict(features)

# Show a sample of clustered rows as a table.
fig_cluster_table = ff.create_table(features.head(10))
fig_cluster_table.show()

# Scatter plot colored by cluster assignment.
fig2 = px.scatter(
    features,
    x="salary",
    y="years_experience",
    color="cluster",
    title="KMeans Clusters of Jobs (by Salary & Experience)",
)
# fig2
Visualizations
Code
# Overlaid salary histograms: AI vs non-AI postings on one axis.
fig1 = px.histogram(
    df,
    x="salary",
    color="is_ai",
    barmode="overlay",
    title="Salary Distribution: AI vs Non-AI Jobs",
)
fig1.write_image("salary_distribution.png")
fig1.show()
Insights for Job Seekers
AI roles often cluster at higher salaries compared to non-AI roles.
Experience remains critical — higher years of experience align with higher pay clusters.
Industries with strong AI adoption (e.g., tech, finance) show clearer salary advantages.
Takeaways:
Highlight AI-related skills to access higher-paying roles.
Leverage industry trends to target fields with high AI adoption.
Use clustering insights to understand where your profile fits (AI-heavy vs. traditional roles).
1️⃣ Load the Dataset
Code
from pyspark.sql import SparkSession
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
import plotly.express as px

# One local Spark session for the whole document.
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Read the raw CSV: header row, inferred types, and multiline quoted fields
# (job descriptions contain embedded newlines and double quotes).
df = (
    spark.read.option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")
    .csv("data/lightcast_job_postings.csv")
)
df.show(5)
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
| ID|LAST_UPDATED_DATE|LAST_UPDATED_TIMESTAMP|DUPLICATES| POSTED| EXPIRED|DURATION| SOURCE_TYPES| SOURCES| URL|ACTIVE_URLS|ACTIVE_SOURCES_INFO| TITLE_RAW| BODY|MODELED_EXPIRED|MODELED_DURATION| COMPANY| COMPANY_NAME|COMPANY_RAW|COMPANY_IS_STAFFING|EDUCATION_LEVELS|EDUCATION_LEVELS_NAME|MIN_EDULEVELS| MIN_EDULEVELS_NAME|MAX_EDULEVELS|MAX_EDULEVELS_NAME|EMPLOYMENT_TYPE|EMPLOYMENT_TYPE_NAME|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|IS_INTERNSHIP|SALARY|REMOTE_TYPE|REMOTE_TYPE_NAME|ORIGINAL_PAY_PERIOD|SALARY_TO|SALARY_FROM| LOCATION| CITY| CITY_NAME|COUNTY| COUNTY_NAME| MSA| MSA_NAME|STATE|STATE_NAME|COUNTY_OUTGOING|COUNTY_NAME_OUTGOING|COUNTY_INCOMING|COUNTY_NAME_INCOMING|MSA_OUTGOING| MSA_NAME_OUTGOING|MSA_INCOMING| MSA_NAME_INCOMING|NAICS2| NAICS2_NAME|NAICS3| NAICS3_NAME|NAICS4| NAICS4_NAME|NAICS5| NAICS5_NAME|NAICS6| NAICS6_NAME| TITLE| TITLE_NAME| TITLE_CLEAN| SKILLS| SKILLS_NAME| SPECIALIZED_SKILLS|SPECIALIZED_SKILLS_NAME| CERTIFICATIONS| CERTIFICATIONS_NAME| COMMON_SKILLS| COMMON_SKILLS_NAME| SOFTWARE_SKILLS|SOFTWARE_SKILLS_NAME| ONET| ONET_NAME| ONET_2019| ONET_2019_NAME| CIP6| CIP6_NAME| CIP4| CIP4_NAME| CIP2| CIP2_NAME|SOC_2021_2| SOC_2021_2_NAME|SOC_2021_3| SOC_2021_3_NAME|SOC_2021_4|SOC_2021_4_NAME|SOC_2021_5|SOC_2021_5_NAME|LOT_CAREER_AREA|LOT_CAREER_AREA_NAME|LOT_OCCUPATION| LOT_OCCUPATION_NAME|LOT_SPECIALIZED_OCCUPATION|LOT_SPECIALIZED_OCCUPATION_NAME|LOT_OCCUPATION_GROUP|LOT_OCCUPATION_GROUP_NAME|LOT_V6_SPECIALIZED_OCCUPATION|LOT_V6_SPECIALIZED_OCCUPATION_NAME|LOT_V6_OCCUPATION|LOT_V6_OCCUPATION_NAME|LOT_V6_OCCUPATION_GROUP|LOT_V6_OCCUPATION_GROUP_NAME|LOT_V6_CAREER_AREA|LOT_V6_CAREER_AREA_NAME| SOC_2| SOC_2_NAME| SOC_3| SOC_3_NAME| SOC_4| SOC_4_NAME| SOC_5| SOC_5_NAME|LIGHTCAST_SECTORS|LIGHTCAST_SECTORS_NAME|NAICS_2022_2| NAICS_2022_2_NAME|NAICS_2022_3| NAICS_2022_3_NAME|NAICS_2022_4| NAICS_2022_4_NAME|NAICS_2022_5| NAICS_2022_5_NAME|NAICS_2022_6| NAICS_2022_6_NAME|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|1f57d95acf4dc67ed...| 9/6/2024| 2024-09-06 16:32:...| 0|6/2/2024| 6/8/2024| 6| [\n "Company"\n]|[\n "brassring.c...|[\n "https://sjo...| []| NULL|Enterprise Analys...|31-May-2024\n\nEn...| 6/8/2024| 6| 894731| Murphy USA| Murphy USA| false| [\n 2\n]| [\n "Bachelor's ...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 2| 2| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 33.20...|RWwgRG9yYWRvLCBBUg==|El Dorado, AR| 5139| Union, AR|20980| El Dorado, AR| 5| Arkansas| 5139| Union, AR| 5139| Union, AR| 20980| El Dorado, AR| 20980| El Dorado, AR| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...|441330|Automotive Parts ...|ET29C073C03D1F86B4|Enterprise Analysts|enterprise analys...|[\n "KS126DB6T06...|[\n "Merchandisi...|[\n "KS126DB6T06...| [\n "Merchandisi...| []| []|[\n "KS126706DPF...|[\n "Mathematics...|[\n "KS440W865GC...|[\n "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n "45.0601",\n...|[\n "Economics, ...|[\n "45.06",\n ...|[\n "Economics",...|[\n "45",\n "27...|[\n "Social Scie...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101011| General ERP Analy...| 2310| Business Intellig...| 23101011| General ERP Analy...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 7\n]| [\n "Artificial ...| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...| 441330|Automotive Parts ...|
|0cb072af26757b6c4...| 8/2/2024| 2024-08-02 13:08:...| 0|6/2/2024| 8/1/2024| NULL| [\n "Job Board"\n]| [\n "maine.gov"\n]|[\n "https://job...| []| NULL|Oracle Consultant...|Oracle Consultant...| 8/1/2024| NULL| 133098|Smx Corporation L...| SMX| true| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3| 3| false| NULL| 1| Remote| NULL| NULL| NULL|{\n "lat": 44.31...| QXVndXN0YSwgTUU=| Augusta, ME| 23011| Kennebec, ME|12300|Augusta-Watervill...| 23| Maine| 23011| Kennebec, ME| 23011| Kennebec, ME| 12300|Augusta-Watervill...| 12300|Augusta-Watervill...| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n "KS122626T55...|[\n "Procurement...|[\n "KS122626T55...| [\n "Procurement...| []| []| []| []|[\n "BGSBF3F508F...|[\n "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...| 561320|Temporary Help Se...|
|85318b12b3331fa49...| 9/6/2024| 2024-09-06 16:32:...| 1|6/2/2024| 7/7/2024| 35| [\n "Job Board"\n]|[\n "dejobs.org"\n]|[\n "https://dej...| []| NULL| Data Analyst|Taking care of pe...| 6/10/2024| 8|39063746| Sedgwick| Sedgwick| false| [\n 2\n]| [\n "Bachelor's ...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 5| NULL| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 32.77...| RGFsbGFzLCBUWA==| Dallas, TX| 48113| Dallas, TX|19100|Dallas-Fort Worth...| 48| Texas| 48113| Dallas, TX| 48113| Dallas, TX| 19100|Dallas-Fort Worth...| 19100|Dallas-Fort Worth...| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...|524291| Claims Adjusting|ET3037E0C947A02404| Data Analysts| data analyst|[\n "KS1218W78FG...|[\n "Management"...|[\n "ESF3939CE1F...| [\n "Exception R...|[\n "KS683TN76T7...|[\n "Security Cl...|[\n "KS1218W78FG...|[\n "Management"...|[\n "KS126HY6YLT...|[\n "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...| 524291| Claims Adjusting|
|1b5c3941e54a1889e...| 9/6/2024| 2024-09-06 16:32:...| 1|6/2/2024|7/20/2024| 48| [\n "Job Board"\n]|[\n "disabledper...|[\n "https://www...| []| NULL|Sr. Lead Data Mgm...|About this role:\...| 6/12/2024| 10|37615159| Wells Fargo|Wells Fargo| false| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3| NULL| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 33.44...| UGhvZW5peCwgQVo=| Phoenix, AZ| 4013| Maricopa, AZ|38060|Phoenix-Mesa-Chan...| 4| Arizona| 4013| Maricopa, AZ| 4013| Maricopa, AZ| 38060|Phoenix-Mesa-Chan...| 38060|Phoenix-Mesa-Chan...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking|522110| Commercial Banking|ET2114E0404BA30075|Management Analysts|sr lead data mgmt...|[\n "KS123QX62QY...|[\n "Exit Strate...|[\n "KS123QX62QY...| [\n "Exit Strate...| []| []|[\n "KS7G6NP6R6L...|[\n "Reliability...|[\n "KS4409D76NW...|[\n "SAS (Softwa...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 6\n]| [\n "Data Privac...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking| 522110| Commercial Banking|
|cb5ca25f02bdf25c1...| 6/19/2024| 2024-06-19 03:00:00| 0|6/2/2024|6/17/2024| 15|[\n "FreeJobBoar...|[\n "craigslist....|[\n "https://mod...| []| NULL|Comisiones de $10...|Comisiones de $10...| 6/17/2024| 15| 0| Unclassified| LH/GM| false| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 3|Part-time / full-...| NULL| NULL| false| 92500| 0| [None]| year| 150000| 35000|{\n "lat": 37.63...| TW9kZXN0bywgQ0E=| Modesto, CA| 6099|Stanislaus, CA|33700| Modesto, CA| 6|California| 6099| Stanislaus, CA| 6099| Stanislaus, CA| 33700| Modesto, CA| 33700| Modesto, CA| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET0000000000000000| Unclassified|comisiones de por...| []| []| []| []| []| []| []| []| []| []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...| 999999|Unclassified Indu...|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
only showing top 5 rows
2. Feature Engineering
Code
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Keep only rows that have every field the models need, then add a squared
# experience term so a linear model can pick up curvature in experience.
required_cols = ["SALARY", "MIN_YEARS_EXPERIENCE", "EMPLOYMENT_TYPE_NAME", "NAICS_2022_2_NAME"]
df = df.dropna(subset=required_cols)
df = df.withColumn("MIN_YEARS_EXPERIENCE_SQ", F.col("MIN_YEARS_EXPERIENCE") ** 2)

# Index the two categorical columns, then one-hot encode both in a single stage.
naics_indexer = StringIndexer(inputCol="NAICS_2022_2_NAME", outputCol="naics_idx")
emp_indexer = StringIndexer(inputCol="EMPLOYMENT_TYPE_NAME", outputCol="emp_idx")
onehot = OneHotEncoder(inputCols=["naics_idx", "emp_idx"], outputCols=["naics_vec", "emp_vec"])

# NOTE(review): SALARY_FROM / SALARY_TO are the posted salary bounds — feeding
# them to a model that predicts SALARY almost certainly leaks the target
# (consistent with the near-perfect R² downstream). Confirm this is intended.
assembler = VectorAssembler(
    inputCols=["MIN_YEARS_EXPERIENCE", "MIN_YEARS_EXPERIENCE_SQ", "SALARY_FROM", "SALARY_TO", "naics_vec", "emp_vec"],
    outputCol="features",
)

# Same stage order as before: index → encode → assemble.
pipeline = Pipeline(stages=[naics_indexer, emp_indexer, onehot, assembler])
model = pipeline.fit(df)
data = model.transform(df)
data.select("features", "SALARY").show(5)
+--------------------+------+
| features|SALARY|
+--------------------+------+
|(26,[0,1,2,3,4,24...| 92962|
|(26,[0,1,2,3,11,2...|107645|
|(26,[0,1,2,3,4,24...|192800|
|(26,[0,1,2,3,4,24...|125900|
|(26,[0,1,2,3,7,24...|170000|
+--------------------+------+
only showing top 5 rows
## 3. Train/Test Split
Code
# Hold out 20% of rows for evaluation; a fixed seed keeps the split reproducible.
train, test = data.randomSplit([0.8, 0.2], seed=42)
n_train, n_test = train.count(), test.count()
print("Training rows:", n_train, "| Test rows:", n_test)
Training rows: 18966 | Test rows: 4731
## 4. Linear Regression
Code
from pyspark.ml.regression import LinearRegression

# Ordinary least-squares baseline on the assembled feature vector.
ols = LinearRegression(featuresCol="features", labelCol="SALARY")
lr_model = ols.fit(train)
pred_lr = lr_model.transform(test)

# Metrics below come from the training summary (i.e. the training split).
summary = lr_model.summary
print("Linear Regression Results:")
print("R²:", summary.r2)
print("RMSE:", summary.rootMeanSquaredError)
Linear Regression Results:
R²: 0.9990706054652662
RMSE: 1325.86711059027
## 5. Polynomial Regression
Code
# NOTE(review): this "polynomial" model is byte-for-byte the same
# LinearRegression on the same feature vector as the linear model above —
# the only polynomial term (MIN_YEARS_EXPERIENCE_SQ) is already part of
# "features" for both models, so they fit identical coefficients and report
# identical metrics (see the matching R²/RMSE printouts). Confirm whether a
# genuinely higher-order expansion (e.g. pyspark.ml.feature.PolynomialExpansion)
# was intended here.
poly_lr = LinearRegression(featuresCol= "features" , labelCol= "SALARY" )
poly_model = poly_lr.fit(train)
pred_poly = poly_model.transform(test)
# Training-summary metrics (computed on the training split, not the test set).
print ("Polynomial Regression Results:" )
print ("R²:" , poly_model.summary.r2)
print ("RMSE:" , poly_model.summary.rootMeanSquaredError)
Polynomial Regression Results:
R²: 0.9990706054652662
RMSE: 1325.86711059027
## 6. Random Forest Regressor
Code
from pyspark.ml.regression import RandomForestRegressor

# Random-forest benchmark: 200 trees, depth 8, fixed seed for reproducibility.
rf = RandomForestRegressor(featuresCol="features", labelCol="SALARY", numTrees=200, maxDepth=8, seed=42)
rf_model = rf.fit(train)
pred_rf = rf_model.transform(test)

# BUG FIX: featureImportances has one entry per *expanded* slot of the
# assembled vector (one slot per one-hot category — 26 here), so slicing it
# to len(getInputCols()) (6) silently mislabels the one-hot importances.
# Instead, map each slot back to its source column via the ml_attr metadata
# that VectorAssembler attaches to the "features" column, and sum per column.
attrs = data.schema["features"].metadata["ml_attr"]["attrs"]
input_cols = assembler.getInputCols()
slot_to_col = {}
for group in attrs.values():
    for attr in group:
        slot_name = attr["name"]
        # One-hot slots are named "<vecCol>_<category>"; numeric slots keep the column name.
        base = next((c for c in input_cols if slot_name == c or slot_name.startswith(c + "_")), slot_name)
        slot_to_col[attr["idx"]] = base

agg = {}
for idx, value in enumerate(rf_model.featureImportances.toArray()):
    col = slot_to_col.get(idx, f"slot_{idx}")
    agg[col] = agg.get(col, 0.0) + float(value)

importance = (
    pd.DataFrame({"Feature": list(agg.keys()), "Importance": list(agg.values())})
    .sort_values("Importance", ascending=False)
    .reset_index(drop=True)
)
importance.head(10)
3
SALARY_TO
0.555946
2
SALARY_FROM
0.324545
0
MIN_YEARS_EXPERIENCE
0.058599
1
MIN_YEARS_EXPERIENCE_SQ
0.040476
4
naics_vec
0.006524
5
emp_vec
0.000508
## 7. Compare Models
Code
from pyspark.ml.evaluation import RegressionEvaluator

# FIX: the original table mixed training-summary R² for the two linear models
# with a blank (None) entry for the random forest, while RMSE was computed on
# the test set — the columns were not comparable. Evaluate BOTH metrics on the
# same held-out test predictions for all three models.
evaluator = RegressionEvaluator(labelCol="SALARY", predictionCol="prediction", metricName="rmse")
r2_evaluator = RegressionEvaluator(labelCol="SALARY", predictionCol="prediction", metricName="r2")

predictions = {
    "Linear Regression": pred_lr,
    "Polynomial Regression": pred_poly,
    "Random Forest": pred_rf,
}
metrics = pd.DataFrame({
    "Model": list(predictions),
    "RMSE": [evaluator.evaluate(p) for p in predictions.values()],
    "R2": [r2_evaluator.evaluate(p) for p in predictions.values()],
})
metrics
metrics
0
Linear Regression
1429.570492
0.999071
1
Polynomial Regression
1429.570492
0.999071
2
Random Forest
4495.386677
NaN